library(tidyverse)
library(gtable)
library(gtsummary)
library(gridExtra)
library(knitr)
library(car)
library(boot)
data <- readRDS("Dataset.RData")
melanoma <- readRDS("Dataset.RData") %>%
mutate(age_c = mean(age)-age,
trans_class_bin = ifelse(trans_class == "immune",1,0),
prop_infiltrated = infiltration_count/tile_count,
stage_bin = ifelse(stage == 3, "3",
ifelse(stage == 4, NA, "1-2"))) %>%
filter(!is.na(stage_bin))
data <- melanoma
N = length(melanoma$id)
data$y = 0
data$y[data$survival == 'Long'] = 1
# immune selected genes MC1R, AKT1, NAMPT, TNFRSF19, PAK2, SOS1, B2M
l1 <- glm(data$y ~ data$MC1R + as.factor(data$stage), family = 'binomial')
l1$coefficients[2]
## data$MC1R
## -0.9632995
l2 <- glm(data$y ~ data$AKT1 + as.factor(data$stage), family = 'binomial')
l2$coefficients[2]
## data$AKT1
## -0.917543
l3 <- glm(data$y ~ data$NAMPT + as.factor(data$stage), family = 'binomial')
l3$coefficients[2]
## data$NAMPT
## 0.1939064
l4 <- glm(data$y ~ data$TNFRSF19 + as.factor(data$stage), family = 'binomial')
l4$coefficients[2]
## data$TNFRSF19
## 0.23159
l5 <- glm(data$y ~ data$PAK2 + as.factor(data$stage), family = 'binomial')
l5$coefficients[2]
## data$PAK2
## 0.5984475
l6 <- glm(data$y ~ data$SOS1 + as.factor(data$stage), family = 'binomial')
l6$coefficients[2]
## data$SOS1
## 0.2604371
l7 <- glm(data$y ~ data$B2M + as.factor(data$stage), family = 'binomial')
l7$coefficients[2]
## data$B2M
## 0.3445805
betas <- unname(c(l1$coefficients[2], l2$coefficients[2], l3$coefficients[2], l4$coefficients[2], l5$coefficients[2], l6$coefficients[2], l7$coefficients[2]))
dd = data[c('MC1R', 'AKT1', 'NAMPT', 'TNFRSF19', 'PAK2', 'SOS1', 'B2M')]
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggcorrplot)
library(heatmaply)
## Loading required package: viridis
## Loading required package: viridisLite
##
## ======================
## Welcome to heatmaply version 1.3.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
heatmaply_cor(cor(dd))
cor(dd)
## MC1R AKT1 NAMPT TNFRSF19 PAK2 SOS1
## MC1R 1.0000000 0.6168210 -0.1632614 -0.1315617 -0.3218287 -0.3567775
## AKT1 0.6168210 1.0000000 -0.2856350 -0.2455591 -0.4477137 -0.3738679
## NAMPT -0.1632614 -0.2856350 1.0000000 0.2330692 0.1857005 0.1142098
## TNFRSF19 -0.1315617 -0.2455591 0.2330692 1.0000000 0.2732336 0.0416091
## PAK2 -0.3218287 -0.4477137 0.1857005 0.2732336 1.0000000 0.2164186
## SOS1 -0.3567775 -0.3738679 0.1142098 0.0416091 0.2164186 1.0000000
## B2M -0.2274429 -0.1892508 0.2003606 -0.0800356 0.1264879 0.1687312
## B2M
## MC1R -0.2274429
## AKT1 -0.1892508
## NAMPT 0.2003606
## TNFRSF19 -0.0800356
## PAK2 0.1264879
## SOS1 0.1687312
## B2M 1.0000000
dd1 = as.data.frame(dd)
dd1$y = data$y
dd1$age_c = data$age - mean(data$age)
dd1$sex = 0
dd1$sex[data$gender == 'female'] = 1
dd1$pathStage = as.factor(data$stage_bin)
model = glm(y ~ MC1R + AKT1 + NAMPT + TNFRSF19 + PAK2 + SOS1 + B2M + pathStage, data = dd1, family = 'binomial')
data.frame(vif(model)[-8], betas) %>%
kable(col.names = c("VIF","$\\hat{\\beta}$"))
| MC1R |
1.182171 |
-0.9632995 |
| AKT1 |
1.390206 |
-0.9175430 |
| NAMPT |
1.127706 |
0.1939064 |
| TNFRSF19 |
1.189646 |
0.2315900 |
| PAK2 |
1.183120 |
0.5984475 |
| SOS1 |
1.183708 |
0.2604371 |
| B2M |
1.098849 |
0.3445805 |
# selected top genes: MC1R, AKT1, PAK2, SOS1, B2M
# model 1: basline + selected top genes
model1 <- glm(y ~ MC1R + AKT1 + PAK2 + sex + age_c + pathStage, data = dd1, family = 'binomial')
summary(model1)
##
## Call:
## glm(formula = y ~ MC1R + AKT1 + PAK2 + sex + age_c + pathStage,
## family = "binomial", data = dd1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2138 -0.9441 0.4158 0.8805 1.9620
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.68720 0.24066 2.856 0.004297 **
## MC1R -0.53926 0.28790 -1.873 0.061055 .
## AKT1 -0.56258 0.23008 -2.445 0.014481 *
## PAK2 0.40056 0.17698 2.263 0.023617 *
## sex -0.22564 0.30450 -0.741 0.458669
## age_c -0.02940 0.01021 -2.880 0.003977 **
## pathStage3 -1.18109 0.31076 -3.801 0.000144 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 330.38 on 238 degrees of freedom
## Residual deviance: 265.65 on 232 degrees of freedom
## AIC: 279.65
##
## Number of Fisher Scoring iterations: 5
model1.diag <- glm.diag(model1)
data.frame("cooks" = model1.diag$cook, "leverage" = model1.diag$h) %>%
pivot_longer(everything(), names_to = "measure") %>%
mutate(cutoff = ifelse(measure=="cooks",1,26/N)) %>%
ggplot(aes(x = seq(0.5,N,by=0.5))) +
geom_line(aes(y = value, col = measure)) +
geom_line(aes(y = cutoff), lty = 2, alpha = 0.5) +
theme_classic() +
facet_wrap(~measure, scales = "free_y", nrow = 2) +
scale_color_manual(values = c("orange","cornflowerblue")) +
labs(x = "observation")

N=239
p_res1 <- residuals(model1, type = "pearson")
d_res1 <- residuals(model1, type = "deviance")
data.frame("pearson" = p_res1, "deviance" = d_res1) %>%
pivot_longer(everything(), names_to = "residual") %>%
ggplot(aes(x = seq(0.5,N,by=0.5), y = value, col = residual)) +
geom_point() +
facet_wrap(~residual, nrow = 2) +
theme_classic() +
scale_color_manual(values = c("darkgreen","purple")) +
labs(x = "observation")

data.frame("pearson" = model1.diag$rp, "deviance" = model1.diag$rd) %>%
pivot_longer(everything(), names_to = "residual") %>%
ggplot(aes(x = seq(0.5,N,by=0.5), y = value, col = residual)) +
geom_point() +
facet_wrap(~residual, nrow = 2) +
theme_classic() +
scale_color_manual(values = c("darkgreen","purple")) +
labs(x = "observation", y = "standardized values")

#kable(data.frame("pearson" = p_res1, "deviance" = d_res1,"leverage" = model1.diag$h, "cooks" = model1.diag$cook,"p_standard" = model1.diag$rp, "d_standard" = model1.diag$rd),col.names = c("$\\hat{r}^{P}$","$\\hat{r}^{D}$","leverage","cooks","$\\hat{r}^{PS}$","$\\hat{r}^{DS}$"))
# model 2: baseline + selected top genes + immune response genes
dd1$CD86 = data$CD86
dd1$DDX58 = data$DDX58
dd1$KLRD1 = data$KLRD1
dd1$TLR4 = data$TLR4
dd1$DCK = data$DCK
dd1$C5 = data$C5
dd1$CCL28 = data$CCL28
dd1$RFX5 = data$RFX5
dd1$EIF2AK2 = data$EIF2AK2
model2 <- glm(y ~ MC1R + AKT1 + PAK2 + sex + age_c + DDX58 + KLRD1 + TLR4 + pathStage, data = dd1, family = 'binomial')
summary(model2)
##
## Call:
## glm(formula = y ~ MC1R + AKT1 + PAK2 + sex + age_c + DDX58 +
## KLRD1 + TLR4 + pathStage, family = "binomial", data = dd1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1888 -0.9154 0.3934 0.8375 2.1739
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.82951 0.25211 3.290 0.00100 **
## MC1R -0.27151 0.29388 -0.924 0.35554
## AKT1 -0.55906 0.23691 -2.360 0.01829 *
## PAK2 0.32341 0.18538 1.745 0.08105 .
## sex -0.23917 0.31225 -0.766 0.44370
## age_c -0.03071 0.01056 -2.908 0.00364 **
## DDX58 0.04621 0.16474 0.281 0.77908
## KLRD1 0.26288 0.18681 1.407 0.15938
## TLR4 0.38050 0.21225 1.793 0.07302 .
## pathStage3 -1.38949 0.33121 -4.195 2.73e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 330.38 on 238 degrees of freedom
## Residual deviance: 256.07 on 229 degrees of freedom
## AIC: 276.07
##
## Number of Fisher Scoring iterations: 5
model2.diag <- glm.diag(model2)
data.frame("cooks" = model2.diag$cook, "leverage" = model2.diag$h) %>%
pivot_longer(everything(), names_to = "measure") %>%
mutate(cutoff = ifelse(measure=="cooks",1,26/240)) %>%
ggplot(aes(x = seq(0.5,N,by=0.5))) +
geom_line(aes(y = value, col = measure)) +
geom_line(aes(y = cutoff), lty = 2, alpha = 0.5) +
theme_classic() +
facet_wrap(~measure, scales = "free_y", nrow = 2) +
scale_color_manual(values = c("orange","cornflowerblue")) +
labs(x = "observation")

p_res2 <- residuals(model2, type = "pearson")
d_res2 <- residuals(model2, type = "deviance")
data.frame("pearson" = p_res2, "deviance" = d_res2) %>%
pivot_longer(everything(), names_to = "residual") %>%
ggplot(aes(x = seq(0.5,N,by=0.5), y = value, col = residual)) +
geom_point() +
facet_wrap(~residual, nrow = 2) +
theme_classic() +
scale_color_manual(values = c("darkgreen","purple")) +
labs(x = "observation")

data.frame("pearson" = model2.diag$rp, "deviance" = model2.diag$rd) %>%
pivot_longer(everything(), names_to = "residual") %>%
ggplot(aes(x = seq(0.5,N,by=0.5), y = value, col = residual)) +
geom_point() +
facet_wrap(~residual, nrow = 2) +
theme_classic() +
scale_color_manual(values = c("darkgoldenrod2","orchid3")) +
labs(x = "observation", y = "standardized values")

#kable(data.frame("pearson" = p_res2, "deviance" = d_res2,"leverage" = model2.diag$h, "cooks" = model2.diag$cook,"p_standard" = model2.diag$rp, "d_standard" = model2.diag$rd),col.names = c("$\\hat{r}^{P}$","$\\hat{r}^{D}$","leverage","cooks","$\\hat{r}^{PS}$","$\\hat{r}^{DS}$"))
# model 3: baseline + selected top genes (2!!!) + immune response genes + age:immune response genes
model3 <- glm(y ~ pathStage*MC1R + pathStage*AKT1 + pathStage*PAK2 + sex + age_c + pathStage*DDX58 + pathStage*KLRD1 + pathStage*TLR4, data = dd1, family = 'binomial')
model4<- glm(y ~ pathStage*MC1R + pathStage*AKT1 + pathStage*PAK2 + sex + age_c, data = dd1, family = 'binomial')
summary(model3)
##
## Call:
## glm(formula = y ~ pathStage * MC1R + pathStage * AKT1 + pathStage *
## PAK2 + sex + age_c + pathStage * DDX58 + pathStage * KLRD1 +
## pathStage * TLR4, family = "binomial", data = dd1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9111 -0.8781 0.1851 0.8371 2.0644
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.06409 0.30921 3.441 0.000579 ***
## pathStage3 -1.57509 0.39032 -4.035 5.45e-05 ***
## MC1R -0.18903 0.37501 -0.504 0.614220
## AKT1 -0.54072 0.32124 -1.683 0.092331 .
## PAK2 0.51523 0.28225 1.825 0.067931 .
## sex -0.12842 0.32561 -0.394 0.693273
## age_c -0.03258 0.01102 -2.958 0.003100 **
## DDX58 -0.27738 0.20877 -1.329 0.183966
## KLRD1 0.42886 0.44573 0.962 0.335973
## TLR4 0.95109 0.48074 1.978 0.047884 *
## pathStage3:MC1R -0.10106 0.60318 -0.168 0.866944
## pathStage3:AKT1 -0.05622 0.48646 -0.116 0.907988
## pathStage3:PAK2 -0.51131 0.39054 -1.309 0.190461
## pathStage3:DDX58 1.08710 0.45414 2.394 0.016678 *
## pathStage3:KLRD1 -0.23974 0.49227 -0.487 0.626260
## pathStage3:TLR4 -0.98786 0.55348 -1.785 0.074290 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 330.38 on 238 degrees of freedom
## Residual deviance: 244.71 on 223 degrees of freedom
## AIC: 276.71
##
## Number of Fisher Scoring iterations: 6
summary(model4)
##
## Call:
## glm(formula = y ~ pathStage * MC1R + pathStage * AKT1 + pathStage *
## PAK2 + sex + age_c, family = "binomial", data = dd1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3949 -0.9524 0.3473 0.8835 1.9123
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.71413 0.24623 2.900 0.003728 **
## pathStage3 -1.21018 0.33113 -3.655 0.000258 ***
## MC1R -0.49375 0.36101 -1.368 0.171416
## AKT1 -0.51748 0.30760 -1.682 0.092506 .
## PAK2 0.62292 0.26121 2.385 0.017089 *
## sex -0.20928 0.30744 -0.681 0.496045
## age_c -0.02825 0.01030 -2.742 0.006099 **
## pathStage3:MC1R -0.02713 0.59157 -0.046 0.963426
## pathStage3:AKT1 -0.11487 0.46470 -0.247 0.804758
## pathStage3:PAK2 -0.43708 0.36093 -1.211 0.225905
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 330.38 on 238 degrees of freedom
## Residual deviance: 264.15 on 229 degrees of freedom
## AIC: 284.15
##
## Number of Fisher Scoring iterations: 6
anova(model3, model4, test = "LRT")
anova(model1, model2, test = "LRT")
model3.diag <- glm.diag(model3)
data.frame("cooks" = model3.diag$cook, "leverage" = model3.diag$h) %>%
pivot_longer(everything(), names_to = "measure") %>%
mutate(cutoff = ifelse(measure=="cooks",1,26/240)) %>%
ggplot(aes(x = seq(0.5,N,by=0.5))) +
geom_line(aes(y = value, col = measure)) +
geom_line(aes(y = cutoff), lty = 2, alpha = 0.5) +
theme_classic() +
facet_wrap(~measure, scales = "free_y", nrow = 2) +
scale_color_manual(values = c("orange","cornflowerblue")) +
labs(x = "observation")

p_res3 <- residuals(model3, type = "pearson")
d_res3 <- residuals(model3, type = "deviance")
data.frame("pearson" = p_res3, "deviance" = d_res3) %>%
pivot_longer(everything(), names_to = "residual") %>%
ggplot(aes(x = seq(0.5,N,by=0.5), y = value, col = residual)) +
geom_point() +
facet_wrap(~residual, nrow = 2) +
theme_classic() +
scale_color_manual(values = c("darkgreen","purple")) +
labs(x = "observation")

data.frame("pearson" = model3.diag$rp, "deviance" = model3.diag$rd) %>%
pivot_longer(everything(), names_to = "residual") %>%
ggplot(aes(x = seq(0.5,N,by=0.5), y = value, col = residual)) +
geom_point() +
facet_wrap(~residual, nrow = 2) +
theme_classic() +
scale_color_manual(values = c("darkgoldenrod2","orchid3")) +
labs(x = "observation", y = "standardized values")

#kable(data.frame("pearson" = p_res3, "deviance" = d_res3,"leverage" = model3.diag$h, "cooks" = model3.diag$cook,"p_standard" = model3.diag$rp, "d_standard" = model3.diag$rd),col.names = c("$\\hat{r}^{P}$","$\\hat{r}^{D}$","leverage","cooks","$\\hat{r}^{PS}$","$\\hat{r}^{DS}$"))
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
## The following objects are masked from 'package:boot':
##
## logit, simplex
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:tidyr':
##
## fill
# model 5: proportional odds model
dd1$CD86 = data$CD86
dd1$DDX58 = data$DDX58
dd1$KLRD1 = data$KLRD1
dd1$TLR4 = data$TLR4
dd1$pathStage = as.factor(data$stage)
model5 <- vglm(pathStage ~ MC1R + AKT1 + PAK2 + SOS1 + B2M + sex + age_c + CD86 + DDX58 + KLRD1 + TLR4, data = dd1, family = cumulative(parallel=T))
## Warning in eval(slot(family, "initialize")): response should be ordinal---see
## ordered()
summary(model5)
##
## Call:
## vglm(formula = pathStage ~ MC1R + AKT1 + PAK2 + SOS1 + B2M +
## sex + age_c + CD86 + DDX58 + KLRD1 + TLR4, family = cumulative(parallel = T),
## data = dd1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 -1.196474 0.156455 -7.647 2.05e-14 ***
## (Intercept):2 0.307711 0.139717 2.202 0.0276 *
## MC1R -0.315414 0.144729 -2.179 0.0293 *
## AKT1 0.285869 0.125643 2.275 0.0229 *
## PAK2 0.249782 0.105872 2.359 0.0183 *
## SOS1 0.065107 0.099621 0.654 0.5134
## B2M 0.095804 0.131593 0.728 0.4666
## sex -0.204166 0.186470 -1.095 0.2736
## age_c -0.012211 0.005872 -2.079 0.0376 *
## CD86 -0.323995 0.164611 -1.968 0.0490 *
## DDX58 0.234769 0.100119 2.345 0.0190 *
## KLRD1 0.031465 0.138238 0.228 0.8199
## TLR4 -0.058970 0.117485 -0.502 0.6157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2])
##
## Residual deviance: 489.8513 on 465 degrees of freedom
##
## Log-likelihood: -244.9256 on 465 degrees of freedom
##
## Number of Fisher scoring iterations: 14
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Exponentiated coefficients:
## MC1R AKT1 PAK2 SOS1 B2M sex age_c CD86
## 0.7294869 1.3309179 1.2837450 1.0672736 1.1005435 0.8153270 0.9878635 0.7232542
## DDX58 KLRD1 TLR4
## 1.2646165 1.0319651 0.9427350
model6 <- vglm(pathStage ~ MC1R + AKT1 + PAK2 + SOS1 + B2M + sex + age_c + CD86 + DDX58 + KLRD1 + TLR4
+ age_c:CD86 + age_c:DDX58 + age_c:KLRD1 + age_c:TLR4 , data = dd1, family = cumulative(parallel = T))
## Warning in eval(slot(family, "initialize")): response should be ordinal---see
## ordered()
summary(model6)
##
## Call:
## vglm(formula = pathStage ~ MC1R + AKT1 + PAK2 + SOS1 + B2M +
## sex + age_c + CD86 + DDX58 + KLRD1 + TLR4 + age_c:CD86 +
## age_c:DDX58 + age_c:KLRD1 + age_c:TLR4, family = cumulative(parallel = T),
## data = dd1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 -1.227502 0.164555 -7.460 8.68e-14 ***
## (Intercept):2 0.351872 0.146671 2.399 0.016437 *
## MC1R -0.378942 0.146742 -2.582 0.009812 **
## AKT1 0.277384 0.130332 2.128 0.033313 *
## PAK2 0.249584 0.109425 2.281 0.022556 *
## SOS1 0.071908 0.103761 0.693 0.488302
## B2M 0.137753 0.141915 0.971 0.331713
## sex -0.251573 0.195706 -1.285 0.198630
## age_c -0.019616 0.006452 -3.040 0.002365 **
## CD86 -0.233614 0.171843 -1.359 0.173999
## DDX58 0.329202 0.117563 2.800 0.005107 **
## KLRD1 -0.091329 0.162974 -0.560 0.575212
## TLR4 -0.113594 0.126943 -0.895 0.370871
## age_c:CD86 0.021738 0.013329 1.631 0.102925
## age_c:DDX58 -0.035834 0.010462 -3.425 0.000614 ***
## age_c:KLRD1 -0.028273 0.010840 -2.608 0.009104 **
## age_c:TLR4 -0.011448 0.011069 -1.034 0.301021
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2])
##
## Residual deviance: 474.2647 on 461 degrees of freedom
##
## Log-likelihood: -237.1323 on 461 degrees of freedom
##
## Number of Fisher scoring iterations: 15
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Exponentiated coefficients:
## MC1R AKT1 PAK2 SOS1 B2M sex
## 0.6845854 1.3196733 1.2834914 1.0745560 1.1476919 0.7775766
## age_c CD86 DDX58 KLRD1 TLR4 age_c:CD86
## 0.9805748 0.7916670 1.3898589 0.9127171 0.8926203 1.0219761
## age_c:DDX58 age_c:KLRD1 age_c:TLR4
## 0.9648000 0.9721226 0.9886169